logo

Project Summary

The aim of this project is to investigate the association between AF and lifestyle risk factor and develop a lifestyle risk score to predict AF.

library(data.table)
library(ggplot2)
library(tidyverse)
library(ggthemes)
library(cowplot)
library(fastDummies)
library(flextable)
library(officer)
library(corrplot)
library(glmnet)
library(pROC)
library(rms) #for nomogram
library(knitr) #for include_graphics
library(umap) #umap plots
library(cutpointr) #ROC cutpoint analysis
library(MLmetrics) #to do logloss/f1 scores over glm model
library(DescTools) #to compute Brier score
library(riskRegression) # to do calibrarion plots
  • Dividing the Data

We randomly divided the filtered data into a testing (70%) and validation set (30%).

validating_score <- function(dataset, model, label) {
  x <- computeScores(dataset, model)
  lr <- glm("AF_Status ~ scale(score)", family = binomial, data = x)
  lr_or <- get_ors(lr)
  #AUC
  #AUC(y_pred = lr$fitted.values, y_true = x$AF_Status) #using MLmetrics
  pred <- predict(lr, x)
  ro <- pROC::roc(x$AF_Status, pred)
  au <- as.numeric(pROC::auc(ro))
  ci <- as.numeric(pROC::ci(ro))
  auc_l <- format(round(ci[1], 3), nsmall = 3)
  auc_h <- format(round(ci[3], 3), nsmall = 3)
  auc <- format(round(au, 3), nsmall = 3)
  #Logloss
  ll <- LogLoss(y_pred = lr$fitted.values, y_true=x$AF_Status)
  ll <- format(round(ll, 3), nsmall = 3)
  #Brier score
  br <- BrierScore(lr)
  br <- format(round(br, 3), nsmall = 3)
    auc_info <- c(label, paste0(auc, " (", auc_l, ", ", auc_h, ")"), ll, br)
  return(list(auc_info, ro))
}

validating_manual_score <- function(dataset) {
    #create columns for MODIFY models
  dataset$age_65_74 <- ifelse(dataset$age >= 65 & dataset$age < 75, 1, 0)
  dataset$age_75 <- ifelse(dataset$age > 74, 1, 0)
  dataset$age_60_74 <- ifelse(dataset$age >= 60 & dataset$age < 75, 1, 0)
  dataset$age_60_64 <- ifelse(dataset$age >= 60 & dataset$age < 64, 1, 0)
  dataset$age_65 <- ifelse(dataset$age > 64, 1, 0)
  dataset$Alcohol_6_15 <- ifelse(dataset$Alcohol_sdd > 5 & dataset$Alcohol_sdd <= 15, 1, 0)
  dataset$Alcohol_16_50 <- ifelse(dataset$Alcohol_sdd > 15, 1, 0)
    model_au_or <- list()
    for(i in 1:length(MODIFYAF)) {
        x <- computeScores(dataset, MODIFYAF[[i]])
        lr <- glm("AF_Status ~ scale(score)", family = binomial, data = x)
        lr_or <- get_ors(lr)
        #AUC
        pred <- predict(lr, x)
        ro <- pROC::roc(x$AF_Status, pred)
        au <- as.numeric(pROC::auc(ro))
        ci <- as.numeric(pROC::ci(ro))
        auc_l <- format(round(ci[1], 3), nsmall = 3)
        auc_h <- format(round(ci[3], 3), nsmall = 3)
        auc <- format(round(au, 3), nsmall = 3)
        #Logloss
        ll <- LogLoss(y_pred = lr$fitted.values, y_true=x$AF_Status)
        ll <- format(round(ll, 3), nsmall = 3)
        #Brier score
        br <- BrierScore(lr)
        br <- format(round(br, 3), nsmall = 3)
        auc_info <- c(paste0("MODIFYAF ", i), 
                                    paste0(auc, " (", auc_l, ", ", auc_h, ")"), ll, br)
        model_au_or[[i]] <- list(auc_info, ro)
    }
  return(model_au_or)
}


plot_multiroc <- function(l_rocs, l_labels) {
    roc_data <- data.frame(sensitiviy = character(), 
                                                 specificity = character(), 
                                                 Model = character(),
                                                 stringsAsFactors=FALSE)
    for(i in 1:length(l_rocs)) {
        x <- data.frame(sensitiviy = l_rocs[[i]]$sensitivities,
                                        specificity = l_rocs[[i]]$specificities, stringsAsFactors=FALSE)
        #Get AUC
        au <- as.numeric(pROC::auc(l_rocs[[i]])) * 100
        au <- format(round(au, 1), nsmall = 1)
        x$Model = paste0(l_labels[[i]], " (AUC: ", au, "%)")
        roc_data <- rbind(roc_data, x)
    }
    pl <- ggplot(data=roc_data, aes(x=1-specificity, y=sensitiviy, color=Model)) + 
        geom_line(size=2) + theme_bw() +    
        scale_x_continuous(name="1 - Specificity") +
        scale_y_continuous(name="Sensitivity") +
        theme(legend.position = c(0.8, 0.2), 
                    legend.margin = margin(0.2, 0.2, 0.2, 0.2, "cm"),
                    legend.text = element_text(size=12),
                    legend.title = element_text(size=14),
                    legend.background = element_blank(),
                    legend.box.background = element_rect(colour = "black"))
    return(pl)
}

mod_hist_plot <- function(dataset) {
  pdata <- dataset %>%
    group_by(score) %>%  tally() %>%
    mutate(per=round(prop.table(n) * 100, 2))
  af_data <- dataset[dataset$AF_Status==1, ] %>%
    group_by(score) %>%  tally() %>%
    mutate(per=round(prop.table(n) * 100, 2))
  naf_data <- dataset[dataset$AF_Status==0, ] %>%
    group_by(score) %>%  tally() %>%
    mutate(per=round(prop.table(n) * 100, 2))
  d <- list(pdata, af_data, naf_data)
  color <- c("steelblue", "#e34a33", "#a1d99b")
  ys <- c("Individuals (%)", "AF Individuals (%)", "No AF Individuals (%)")
  pl <- list()

  for(i in 1:length(d)) {
    p <- ggplot(d[[i]], aes(x=score, y=per)) +
      geom_bar(stat="identity", fill=color[i]) +
      labs(x="MODIFY_AF risk score", y= ys[i]) +
      theme_minimal()
    pl[[i]] <- p
  }
  pl <- plot_grid(pl[[1]], pl[[2]], pl[[3]], ncol=1)
  return(pl)
}

mod_table_ind <- function(dataset) {
  pdata <- dataset %>%
    group_by(score) %>%  tally() 
  af_data <- dataset[dataset$AF_Status==1, ] %>%
    group_by(score) %>%  tally() 
    colnames(af_data) <- c('score','AF')
  pdata <- merge(pdata, af_data, by='score')
    naf_data <- dataset[dataset$AF_Status==0, ] %>%
    group_by(score) %>%  tally()
    colnames(naf_data) <- c('score','NoAF')
    pdata <- merge(pdata, naf_data, by='score')

    tab_pr <- flextable(pdata) %>%
    width(j=1, width = 3) %>% #width of each column
    width(j=c(2:4), width = 2)
  tab_pr <- tab_pr %>% set_header_labels(Score="Score", n="Total Individuals",
                                                                                 AF="AF cases", NoAF="Controls")
  # alignment
  tab_pr <- tab_pr %>% flextable::align(align = "center", j = c(2:4), part = "all")
  # font style
  tab_pr <- tab_pr %>%
    style(pr_t=fp_text(font.family= "roboto")) %>%
    fontsize(i = 1, size = 12, part = "header") %>%   # adjust font size of header
    bold(i = 1, bold = TRUE, part = "header") %>%     # adjust bold face of header
    fontsize(part="body", size = 11) %>%
    bold(j = 1, bold = TRUE, part = "body")
    return(tab_pr)
}

Demographic Data

Datasets

Test Cohort

mytab <- NULL
{
  t <- generate_table(test_ss)
  mytab <- demo_table(t)
    mytab <- mytab %>% bold(i = c(1:12,17,21,25), j = 1, bold = TRUE, part = "body")
}
mytab

Validation Cohort

mytab <- NULL
{
  t <- generate_table(validation_ss)
  mytab <- demo_table(t)
    mytab <- mytab %>% bold(i = c(1:12,17,21,25), j = 1, bold = TRUE, part = "body")
}
mytab

Time to AF Diagnosis Plots

Test Cohort

time_to_af_plot(test_ss)

Validation Cohort

time_to_af_plot(validation_ss)

Life Style Parameters

In this work we considered the following variables:

  • Sex (Ref: Female)
  • Age (Continuous variable) Note that in our cohort the maximum age is 73
  • BMI (continuous value)
  • Obesity (Low, Medium, High or equivalent to bmi<27, 27<=bmi<=30, bmi>30) (Ref: Low)
  • Smoking (Never, Previous, Current) (Ref: Never)
  • Alcohol (Continuous variable) Number of standard drinks per week (stdw)
  • Hypertension (Yes/No) (Ref: No)
  • Diabetes (Yes/No) (Ref: No)
  • Sleep apnoea (Yes/No) (Ref: No)
  • Physical inactivity (Yes/No) (Ref: Yes)

From this point we will perform all the analysis over the Test cohort and will only use the validation cohort at the end to validate the model generated.

Density plots and Cutoff

BMI

test_ss$AF <- as.factor(test_ss$AF_Status)
summary_continuous(indata=test_ss, field="bmi",type="num",xlab="BMI (Kg/m2)",
                   leg_pos=c(0.9,0.7), fillvar = "AF", fillname = "AF Status",
                   rel_heights = c(5.5,4.5,3,2.5), boxcol="white") #xlim=c(min_t, max_t)

  • Cutoff point
cp <- cutpointr(data=test_ss, x=bmi, class=AF,
                method=maximize_metric, metric=youden)
r <- plot_roc(cp)
y <- plot_metric(cp)
y <- y +
  geom_vline(aes(xintercept = cp$optimal_cutpoint, color="red")) +
  geom_text(aes(cp$optimal_cutpoint, 0, label = cp$optimal_cutpoint, hjust=-1, color="red")) +
  theme(legend.position="none")
plot_grid(r,y)

Age

summary_continuous(indata=test_ss, field="age_at_recruitment_f21022_0_0",
                   type="num", xlab="Age (years)",
                   leg_pos=c(0.9,0.7), fillvar = "AF", fillname = "AF Status",
                   rel_heights = c(5.5,4.5,3,2.5), boxcol="white") #xlim=c(min_t, max_t)

  • Cutoff point
cp <- cutpointr(data=test_ss, x=age, class=AF,
                method=maximize_metric, metric=youden)
r <- plot_roc(cp)
y <- plot_metric(cp)
y <- y +
  geom_vline(aes(xintercept = cp$optimal_cutpoint, color="red")) +
  geom_text(aes(cp$optimal_cutpoint, 0, label = cp$optimal_cutpoint, hjust=-1, color="red")) +
  theme(legend.position="none")
plot_grid(r,y)

Correlation Plot

#  Correlation plot
M <- cor(test_ss[,c('Sex', 'age', 'Diabetes', 'Sleep_apnoea','Hypertension', 'AF_Status',
                   'Smoking_Never','Smoking_Previous', 'Smoking_Current',
                   'Alcohol_sdd',
                   'Obesity_Low', 'Obesity_Medium', 'Obesity_High',
                                     'Physical_inactivity')])
order.M <- corrMatOrder(M, order = "AOE")
M <-  M[order.M,order.M]
pl <- corrplot(M,
               col=rev(colorRampPalette(pal_col)(10)),
               method="circle",
                type = "upper",
               tl.pos = "td",
               tl.cex = 0.5,
               diag = FALSE,
               tl.col="black")

pl

Clustering Plots

Using the lifestyle parameters (sex, age, obesity, smoking, alcoho, diabtes, sleep apnoe, and physical inactivity) and UMAP (Uniform Manifold Approximation) we explored how the samples clustered depending on their age group, AF status, and sex.

SEX

Age Group

AF

Univariable Analysis

We tested the predictive power of each of the lifestyle variables independently with respect the AF Control/Cases in the test cohort. The results are based on logistic regression on the test sets without any adjusting and effect sizes are per standard deviation.

Odd Ratio for each lifestyle variable

plot_or_data(or_data[[2]])

Table

tab_or
 




A work by Baker Bioinformatics